#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "01_validate_statement_segmentation.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---

# load required R packages
library(quanteda) # CRAN v3.3.1
library(dplyr)    # CRAN v1.1.2
library(stringr)  # CRAN v1.5.0
library(ggplot2)  # CRAN v3.4.2

source("function_theme_base.R")

# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() function for each package, 
# you can adjust them all at once using the
# following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/


# print output of sessionInfo() so that the R version and the versions of the
# loaded packages are recorded together with the results of this script
sessionInfo()

# read the manifesto corpus from disk
corp <- readRDS("data_corpus_japmanifestos.rds")

# regex character class with the delimiters used for statement segmentation
# (Latin and Japanese punctuation marks, a dash, and bullet symbols)
pattern_statement <- "[.?!！•–。●■○・]"

# split every document into statements at these delimiters; each delimiter is
# treated as following the statement it closes (pattern_position = "after")
corp <- corpus_segment(
    corp,
    pattern = pattern_statement,
    valuetype = "regex",
    pattern_position = "after"
)

# store the name and the token count of each statement as document variables
corp$document <- docnames(corp)
corp$ntoken <- ntoken(corp)

# remove everything from the first dot onwards in the statement name
# (presumably the segment suffix appended by corpus_segment(); verify against
# the document names in the corpus)
corp$document_clean <- gsub("\\..*", "", corp$document)

# read the data frame with the manually segmented manifestos
# (60 documents according to the authors' note)
dat_manual <- readRDS("data_manifestos_manual_segmentation.rds")

# unique identifiers of the manually segmented manifestos
docs_manual <- unique(dat_manual[["doc_id"]])

# keep only the automatically segmented statements that belong to
# manifestos that have been handcoded
corp_automated_select <- corpus_subset(corp, document_clean %in% docs_manual)

# only keep statements with more than 2 tokens
corp_automated_select <- corpus_subset(
    corp_automated_select,
    ntoken(corp_automated_select) > 2
)

# build a corpus from the manually segmented texts and split it into
# sentences with quanteda's standard sentence segmentation
corp_manual <- corpus_reshape(corpus(dat_manual), to = "sentences")

# only keep sentences with more than 2 tokens
corp_manual <- corpus_subset(
    corp_manual,
    ntoken(corp_manual) > 2
)

# document-level variables of the manually segmented sentences, with the
# sentence name added as a column
dat_docvars_manual <- docvars(corp_manual)
dat_docvars_manual[["document"]] <- docnames(corp_manual)

# remove everything from the first dot onwards in the sentence name, using
# the same rule that is applied to the automatically segmented corpus
dat_docvars_manual[["document_clean"]] <- gsub(
    "\\..*", "", dat_docvars_manual[["document"]]
)

# document-level variables of the automatically segmented statements
dat_docvars_automated <- docvars(corp_automated_select)

# number of sentences per manifesto (manual segmentation)
dat_docvars_manual_grouped <- summarise(
    group_by(dat_docvars_manual, document_clean),
    n_sentences_manual = n()
)

# number of statements per manifesto (automated segmentation)
dat_docvars_automated_grouped <- summarise(
    group_by(dat_docvars_automated, document_clean),
    n_sentences_automated = n()
)

# combine both counts in one row per manifesto; every manually segmented
# manifesto is kept, and the automated count is NA where no match exists
dat_compare <- dat_docvars_manual_grouped |>
    left_join(dat_docvars_automated_grouped, by = "document_clean")

# derive the election year from a two-digit code contained in the document
# name; the first matching code wins, and names without any of the codes
# receive NA
dat_compare <- mutate(
    dat_compare,
    election = case_when(
        str_detect(document_clean, "03") ~ "2003",
        str_detect(document_clean, "05") ~ "2005",
        str_detect(document_clean, "12") ~ "2012",
        str_detect(document_clean, "14") ~ "2014",
        TRUE ~ NA_character_
    )
)

# distribution of the number of manually segmented sentences per manifesto
summary(dat_compare$n_sentences_manual)

# a manifesto without any automatically segmented statement has a missing
# automated count; report such cases instead of letting them silently turn
# the correlation (and the label in the figure) into NA
docs_unmatched <- dat_compare$document_clean[
    is.na(dat_compare$n_sentences_automated)
]
if (length(docs_unmatched) > 0) {
    warning(
        "No automated statements found for: ",
        paste(docs_unmatched, collapse = ", "),
        call. = FALSE
    )
}

# calculate the Pearson correlation between both counts and format it as a
# text label; incomplete pairs are excluded, which has no effect when all
# manifestos have both counts
dat_cor <- dat_compare |> 
    summarise(
        cor = paste0(
            "r=",
            round(
                cor(n_sentences_manual, n_sentences_automated,
                    use = "pairwise.complete.obs"),
                2
            )
        )
    )

# create scatterplot (Figure A4) ----
# automated counts on the x-axis, manual counts on the y-axis, with a linear
# fit and the correlation label placed at fixed coordinates
fig_a04 <- ggplot(
    dat_compare,
    aes(x = n_sentences_automated, y = n_sentences_manual)
) +
    geom_smooth(method = "lm") +
    geom_point(size = 3, shape = 1) +
    annotate(
        "text", x = 15, y = 85,
        label = dat_cor$cor, size = 5, colour = "grey40"
    ) +
    labs(
        x = "Number of Statements (Automated Segmentation)",
        y = "Number of Statements (Manual Segmentation)"
    )

# display the figure
fig_a04

# save the figure as PDF with width and height of 5 (ggsave's default unit)
ggsave("fig_a04.pdf", plot = fig_a04, width = 5, height = 5)
